from PIL import Image, ImageStat, ImageCms
import glob
import os.path, os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline
imgdir = 'images2'  # local directory holding the downloaded images
df_1 = pd.read_csv('df_nonLinear.csv').drop(columns='Unnamed: 0')
# Keep only the rows that actually reference an artwork image.
df_2 = df_1[df_1.Artwork_Image.notna()]
df_2.info()
# Leading part shared by all image URLs
# (expected to be http://artinfo-images-350.s3.amazonaws.com/).
# os.path.commonprefix compares character by character, so its result can end
# in the middle of a path component when all file names start alike; cut it
# back to the last "/" so only whole components are stripped.
url_prefix = os.path.commonprefix(df_2.Artwork_Image.tolist())
url_prefix = url_prefix[:url_prefix.rfind('/') + 1]
# Map each URL to its local path. Slicing removes only the leading prefix
# (str.replace would also rewrite later occurrences, and would mangle every
# URL if the prefix turned out to be empty).
df_3 = df_2.assign(
    Artwork_Image_Path=df_2.Artwork_Image.apply(
        lambda x: imgdir + "/" + x[len(url_prefix):]))
df_3.shape
df_3.head()
# Drop rows whose image file does not exist locally.
df_4 = df_3[df_3.Artwork_Image_Path.map(os.path.exists)]
df_4.shape
def rgb_avg(x):
    """Return the mean red, green and blue level of an image file.

    Parameters
    ----------
    x : str
        Path of the image file to open.

    Returns
    -------
    list of float
        Three per-band means ``[R, G, B]``, each rounded to 3 decimals.
    """
    with Image.open(x) as im:
        # Grayscale, palette, RGBA or CMYK files have a different number of
        # bands; convert so the result always has exactly three values.
        rgb = im if im.mode == "RGB" else im.convert("RGB")
        # Build a real list: on Python 3 ``map`` is a lazy iterator, which
        # cannot be expanded into DataFrame columns by the caller.
        return [round(band_mean, 3) for band_mean in ImageStat.Stat(rgb).mean]
def as_dataframe(avgs):
    """Expand a Series of per-image colour triples into a DataFrame.

    Parameters
    ----------
    avgs : pandas.Series
        Each element holds the three channel values of one image.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``avgs``, indexed like ``avgs``, with the
        columns ``R``, ``G`` and ``B``.
    """
    channel_names = 'R G B'.split()
    rows = avgs.values.tolist()
    return pd.DataFrame(rows, index=avgs.index, columns=channel_names)
# Compute the mean colour of every image and attach it as R, G, B columns.
df_5 = df_4.join(
    as_dataframe(
        df_4.Artwork_Image_Path.map(rgb_avg)
    )
)
df_5.head()

# Cluster the images into eight groups by mean colour (fixed random_state).
X = np.array(df_5.loc[:, ['R', 'G', 'B']])
kmeans = KMeans(n_clusters=8, random_state=0)
kmeans.fit(X)
kmeans
kmeans.n_clusters
kmeans.get_params()

# Table of cluster labels and the number of images assigned to each.
(
    pd.DataFrame.from_records(np.unique(kmeans.labels_, return_counts=True))
    .T
    .rename(columns={0: 'label', 1: 'count'})
    .set_index('label')
)

# Store the cluster label next to each image and save the result.
df_6 = df_5.copy()
df_6['kmean_cluster'] = kmeans.labels_
df_6.to_csv("df_kmean.csv", index=False)
# Pick up to n_imgs random images for each cluster label.
n_imgs = 8
samples = {}
for label in df_6.kmean_cluster.unique():
    members = df_6.loc[df_6.kmean_cluster == label,
                       ['R', 'G', 'B',
                        'Artwork_Image_Path',
                        'kmean_cluster']]
    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds, so cap the request at the size of the cluster.
    samples[label] = members.sample(min(n_imgs, len(members)))
def render_images(data, label):
    """Show the sampled images of one cluster in a two-row grid.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per image with the columns ``R``, ``G``, ``B``,
        ``Artwork_Image_Path`` and ``kmean_cluster``.
    label
        Cluster label shown in the figure title.
    """
    grid_rows = 2
    n_images = data.shape[0]
    # Integer ceiling division: ``/`` yields a float on Python 3, which both
    # plt.subplots and range reject, and rounding up keeps the last image
    # when the number of rows is odd.
    grid_cols = max(1, -(-n_images // grid_rows))
    # squeeze=False keeps axarr two-dimensional even with a single column.
    f, axarr = plt.subplots(grid_rows, grid_cols, figsize=(15, 10),
                            squeeze=False)
    # axarr.flat walks the grid row by row, matching the order of the rows
    # in data.
    for i, ax in enumerate(axarr.flat):
        ax.axis('off')
        if i >= n_images:
            # Grid cell without an image: leave it blank.
            continue
        img = data.iloc[i]
        with Image.open(img.Artwork_Image_Path) as x:
            ax.imshow(x)
        ax.set_title("RGB[{},{},{}] {}\n{}".format(
            int(img.R),
            int(img.G),
            int(img.B),
            img.kmean_cluster,
            img.Artwork_Image_Path.replace(imgdir + "/", '')))
    _ = plt.suptitle("label={}".format(label))
# Render one figure per cluster, in ascending label order. Iterating over the
# labels actually present in samples keeps this in step with the number of
# clusters instead of hard-coding 0-7.
for cluster_label in sorted(samples):
    render_images(samples[cluster_label], cluster_label)